{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from nltk.parse import stanford\n",
    "import nltk\n",
    "os.environ['STANFORD_PARSER'] = '/home/shanu/nltk/jars/stanford-parser.jar'\n",
    "os.environ['STANFORD_MODELS'] = '/home/shanu/nltk/jars/stanford-parser-3.7.0-models.jar'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Dependency Tree\n",
    "from nltk.parse.stanford import StanfordDependencyParser\n",
    "dep_parser=StanfordDependencyParser(model_path=\"/home/shanu/nltk/jars/englishPCFG.ser.gz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def lca(tree, index1, index2):\n",
    "    node = index1\n",
    "    path1 = []\n",
    "    path2 = []\n",
    "    path1.append(index1)\n",
    "    path2.append(index2)\n",
    "    while(node != tree.root):\n",
    "        node = tree.nodes[node['head']]\n",
    "        path1.append(node)\n",
    "    node = index2\n",
    "    while(node != tree.root):\n",
    "        node = tree.nodes[node['head']]\n",
    "        path2.append(node)\n",
    "    for l1, l2 in zip(path1[::-1],path2[::-1]):\n",
    "        if(l1==l2):\n",
    "            temp = l1\n",
    "    return temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def path_lca(tree, node, lca_node):\n",
    "    path = []\n",
    "    path.append(node)\n",
    "    while(node != lca_node):\n",
    "        node = tree.nodes[node['head']]\n",
    "        path.append(node)\n",
    "    return path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def seq(lca):\n",
    "    l=[lca]\n",
    "    for key in tree.nodes[lca]['deps']:\n",
    "        for i in tree.nodes[lca]['deps'][key]:\n",
    "            l.extend(seq(i))\n",
    "    return l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import _pickle \n",
    "f = open('../data/training_data', 'rb')\n",
    "sentences, e1, e2 = _pickle.load(f)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentences[7588] = 'The reaction mixture is kept in the dark at room temperature for 1.5 hours .'\n",
    "sentences[2608] = \"This strawberry sauce has about a million uses , is freezer-friendly , and is so much better than that jar of Smuckers strawberry sauce that you 've had sitting in your fridge since that time you made banana splits 1.5 years ago .\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sentences[2590] = \"The pendant with the bail measure 1.25'' .\"\n",
    "# sentences[2664] = \"The cabinet encloses a 6.5 inch cone woofer , 4 inch cone midrange , and a 0.86 inch balanced dome tweeter .\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "length = len(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "word_p = []\n",
    "rel_p = []\n",
    "pos_p = []\n",
    "for i in range(length):\n",
    "    word_p.append(0)\n",
    "    rel_p.append(0)\n",
    "    pos_p.append(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2590 success [2, 1, 6, 4, 5, 3]\n"
     ]
    }
   ],
   "source": [
    "# for i in range(length):\n",
    "i = 2590\n",
    "try:\n",
    "    parse_tree = dep_parser.raw_parse(sentences[i])\n",
    "    for trees in parse_tree:\n",
    "        tree = trees\n",
    "    node1 = tree.nodes[e1[i]+1]\n",
    "    node2 = tree.nodes[e2[i]+1]\n",
    "    if node1['address']!=None and node2['address']!=None:\n",
    "        lca_node = lca(tree, node1, node2)\n",
    "        path = seq(lca_node['address'])\n",
    "        print(i, \"success\", path)\n",
    "\n",
    "        word_p[i] = [tree.nodes[p][\"word\"] for p in path]\n",
    "        rel_p[i] = [tree.nodes[p][\"rel\"] for p in path]\n",
    "        pos_p[i] = [tree.nodes[p][\"tag\"] for p in path]\n",
    "    else:\n",
    "\n",
    "        print(i, node1[\"address\"], node2[\"address\"])\n",
    "except AssertionError:\n",
    "    print(i, \"error\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "file = open('../data/train_lca_paths', 'wb')\n",
    "_pickle.dump([word_p, rel_p, pos_p], file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
